require(devtools)
install_version("httr", version = "0.2", repos = "http://cran.us.r-project.org")
library(httr)
library(plyr)
library(XML)
library(dplyr) 
library("reshape2")

# Run the rest of the script from the thesis folder, so that relative paths
# (such as the "business.csv" written at the end of this file) resolve there.
# NOTE(review): absolute path tied to one user account; adjust it before
# running the script on another machine.
setwd("/Users/zhangweicen/Desktop/3rd semester/thesis")
# Credentials and endpoint for the New York Times Article Search API (v2).
#
# Sign up for a key at
#   http://developer.nytimes.com/docs/read/article_search_api_v2
# and expose it to R through the NYT_API_KEY environment variable (for
# example in ~/.Renviron) so the secret does not have to live in this file.
API_KEY <- Sys.getenv("NYT_API_KEY")
if (!nzchar(API_KEY)) {
  # Fallback kept so existing setups keep working unchanged.
  # NOTE(review): a key stored in source should be treated as exposed.
  API_KEY <- '1139460898e6ffbb9d46e5a6cceb0d1e:2:71845984'
}

# Base URL every article search request is sent to.
NYT_ARTICLE_SEARCH <- 'http://api.nytimes.com/svc/search/v2/articlesearch.json'

# Convert an HTML fragment to plain text.
#
# Args:
#   str: a single string holding HTML (or plain text). NULL, zero-length,
#        NA and "" are all treated as "no text".
#
# Returns:
#   One character string: the values of all text nodes found under <body>,
#   joined in document order, with newline characters removed. "" when
#   there is nothing to extract.
html2txt <- function(str) {
  # Nothing to parse: answer with an empty string rather than handing the
  # parser a missing value.
  if (is.null(str) || length(str) == 0L) {
    return("")
  }
  if (is.atomic(str) && length(str) == 1L && (is.na(str) || !nzchar(str))) {
    return("")
  }

  nodes <- xpathApply(htmlParse(str, asText = TRUE),
                      "//body//text()",
                      xmlValue)
  # Join every text node. Keeping only the first one would cut the text off
  # at the first inline tag (e.g. "Hello <b>world</b>" would become "Hello ").
  txt <- paste(unlist(nodes), collapse = "")
  gsub('\n', '', txt)
}

# Download every article the NYT Article Search API returns for one section
# within a date range, following the result pages until a page yields no
# usable article.
#
# Args:
#   section:    value matched against the API's `section_name` field.
#   begin_date: sent unchanged as the API's `begin_date` query parameter.
#   end_date:   sent unchanged as the API's `end_date` query parameter.
#
# Returns:
#   A data frame with the columns section, date, url, headline and snippet,
#   one row per article that could be converted. A zero-column data.frame()
#   when no article was retrieved.
#
# Articles that cannot be converted (for example because a field is missing)
# are skipped with a warning instead of aborting the whole download.
get_nyt_articles_by_section <- function(section, begin_date, end_date) {
  columns <- c("section", "date", "url", "headline", "snippet")
  pages <- list()
  # The API numbers its result pages from 0, so starting at 1 would drop the
  # first page of every query.
  k <- 0
  repeat {
    # construct query parameters
    query <- list('api-key' = API_KEY,
                  fq = sprintf('section_name:%s', section),
                  page = k,
                  begin_date = begin_date,
                  end_date = end_date)
    # fetch results
    response <- GET(NYT_ARTICLE_SEARCH, query = query)
    result <- content(response)
    docs <- result$response$docs
    if (is.null(docs)) {
      # NOTE(review): presumably an error payload (quota, page limit, ...)
      # rather than a normal end of results; confirm against the API.
      warning(sprintf("page %d for section '%s' (%s to %s) has no 'docs' element; paging stops here",
                      k, section, begin_date, end_date),
              call. = FALSE)
    }

    # seq_along() keeps an empty page from being iterated as 1, 0.
    rows <- lapply(seq_along(docs), function(i) {
      tryCatch({
        doc <- docs[[i]]
        # NOTE(review): this pattern matches a plain "/", so the
        # substitution looks like a no-op; confirm before removing it.
        row <- data.frame(section = doc$section_name,
                          date = doc$pub_date,
                          url = gsub('\\/', '/', doc$web_url),
                          headline = html2txt(doc$headline),
                          snippet = html2txt(doc$snippet))
        # data.frame() silently drops NULL fields; such a row could not be
        # stacked with complete ones.
        absent <- setdiff(columns, names(row))
        if (length(absent) > 0L) {
          stop("missing field(s): ", paste(absent, collapse = ", "))
        }
        row
      }, error = function(e) {
        warning(sprintf("skipping article %d on page %d: %s",
                        i, k, conditionMessage(e)),
                call. = FALSE)
        NULL
      })
    })
    rows <- Filter(Negate(is.null), rows)

    # A page without a single usable article ends the paging.
    if (length(rows) == 0L) {
      break
    }
    # Collect pages in a list and bind once at the end instead of growing a
    # data frame with rbind() on every iteration.
    pages[[length(pages) + 1L]] <- do.call(rbind, rows)
    k <- k + 1
  }

  # return all results
  if (length(pages) == 0L) {
    return(data.frame())
  }
  do.call(rbind, pages)
}

# Accumulator that the download loop below appends each month's articles to.
business <- data.frame()

# Years to download; one query window is built per calendar month.
year.range <- 2008:2012
n.months <- 12 * length(year.range)
# Derive the first day from year.range so the two cannot drift apart.
first.day <- as.Date(sprintf("%d-01-01", min(year.range)))

# First day of every month in the range.
month.starts <- seq(first.day, by = "month", length.out = n.months)
# Last day of a month = first day of the following month minus one day.
month.ends <- seq(first.day, by = "month", length.out = n.months + 1)[-1] - 1

# Dates as "YYYYMMDD" strings, newest month first.
# format() is the documented way to render a Date with a pattern; newer R
# releases no longer honour a format handed to as.character().
date.end.month <- rev(format(month.ends, "%Y%m%d"))
date.start.month <- rev(format(month.starts, "%Y%m%d"))
# Download the "Business" section one month window at a time, in the order
# the date vectors list them, and stack the results under whatever
# `business` already holds.
# The per-month results are kept in a preallocated list and bound once, which
# avoids re-copying the growing data frame on every iteration. If a request
# fails part-way, the months fetched so far remain available in `monthly`.
monthly <- vector("list", length(date.end.month))
for (i in seq_along(date.end.month)) {
  monthly[[i]] <- get_nyt_articles_by_section("Business",
                                              date.start.month[i],
                                              date.end.month[i])
}
business <- do.call(rbind, c(list(business), monthly))

# Drop repeated articles: for each URL only the first row encountered is kept.
business <- business[which(!duplicated(business[["url"]])), ]

# Save the de-duplicated table as comma-separated text in the working directory.
write.table(x = business, file = "business.csv", sep = ",")